{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# coding: utf-8\n",
    "\n",
    "# In[1]:\n",
    "\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import lightgbm as lgb\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import gc\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "import datetime\n",
    "from sklearn.cross_validation import StratifiedKFold\n",
    "\n",
    "# In[2]:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cur time = 2018/09/21 20:10:16\n",
      "(13887, 3251) (12955, 3251)\n",
      "cur time = 2018/09/21 20:10:16\n"
     ]
    }
   ],
   "source": [
    "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
    "train = np.load('../X_train.npy')\n",
    "test = np.load('../X_test.npy')\n",
    "train_labels = np.load('../labels.npy')\n",
    "\n",
    "print train.shape,test.shape\n",
    "print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_cnn_1 = pd.read_pickle('../train_meta_cnn_a.pkl')\n",
    "test_cnn_1 = pd.read_pickle('../test_meta_cnn_a.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_cnn_2 = pd.read_pickle('../train_meta_dilated_cnn_a.pkl')\n",
    "test_cnn_2 = pd.read_pickle('../test_meta_dilated_cnn_a.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_lgb_1 = pd.read_pickle('../train_meta_lgb_1.pkl')\n",
    "test_lgb_1 = pd.read_pickle('../test_meta_lgb_1.pkl')\n",
    "\n",
    "train_lgb_2 = pd.read_pickle('../train_meta_lgb_2.pkl')\n",
    "test_lgb_2 = pd.read_pickle('../test_meta_lgb_2.pkl')\n",
    "\n",
    "train_lgb_3 = pd.read_pickle('../train_meta_lgb_3.pkl')\n",
    "test_lgb_3 = pd.read_pickle('../test_meta_lgb_3.pkl')\n",
    "\n",
    "train_lgb_4 = pd.read_pickle('../train_meta_lgb_4.pkl')\n",
    "test_lgb_4 = pd.read_pickle('../test_meta_lgb_4.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = np.hstack([train,train_cnn_1, train_cnn_2, train_lgb_1, train_lgb_2, train_lgb_3, train_lgb_4])\n",
    "test = np.hstack([test,test_cnn_1, test_cnn_2, test_lgb_1, test_lgb_2, test_lgb_3, test_lgb_4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Times:  0\n",
      "cur time = 2018/09/21 20:12:20\n",
      "FOLD:  0\n",
      "2780 11107\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[100]\ttraining's multi_logloss: 0.70958\tvalid_1's multi_logloss: 0.755308\n",
      "[200]\ttraining's multi_logloss: 0.354997\tvalid_1's multi_logloss: 0.434251\n",
      "[300]\ttraining's multi_logloss: 0.217096\tvalid_1's multi_logloss: 0.326171\n",
      "[400]\ttraining's multi_logloss: 0.152399\tvalid_1's multi_logloss: 0.289922\n",
      "[500]\ttraining's multi_logloss: 0.117076\tvalid_1's multi_logloss: 0.278519\n",
      "[600]\ttraining's multi_logloss: 0.094704\tvalid_1's multi_logloss: 0.277246\n",
      "Early stopping, best iteration is:\n",
      "[598]\ttraining's multi_logloss: 0.0950629\tvalid_1's multi_logloss: 0.277223\n",
      "cur time = 2018/09/21 20:13:46\n",
      "FOLD:  1\n",
      "2779 11108\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[100]\ttraining's multi_logloss: 0.714406\tvalid_1's multi_logloss: 0.746063\n",
      "[200]\ttraining's multi_logloss: 0.360927\tvalid_1's multi_logloss: 0.41752\n",
      "[300]\ttraining's multi_logloss: 0.223406\tvalid_1's multi_logloss: 0.305014\n",
      "[400]\ttraining's multi_logloss: 0.159355\tvalid_1's multi_logloss: 0.265401\n",
      "[500]\ttraining's multi_logloss: 0.123548\tvalid_1's multi_logloss: 0.251562\n",
      "[600]\ttraining's multi_logloss: 0.100453\tvalid_1's multi_logloss: 0.247619\n",
      "[700]\ttraining's multi_logloss: 0.0840086\tvalid_1's multi_logloss: 0.247471\n",
      "Early stopping, best iteration is:\n",
      "[645]\ttraining's multi_logloss: 0.0925202\tvalid_1's multi_logloss: 0.246913\n",
      "cur time = 2018/09/21 20:15:23\n",
      "FOLD:  2\n",
      "2777 11110\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[100]\ttraining's multi_logloss: 0.710447\tvalid_1's multi_logloss: 0.758826\n",
      "[200]\ttraining's multi_logloss: 0.354958\tvalid_1's multi_logloss: 0.436029\n",
      "[300]\ttraining's multi_logloss: 0.216709\tvalid_1's multi_logloss: 0.326181\n",
      "[400]\ttraining's multi_logloss: 0.15243\tvalid_1's multi_logloss: 0.287969\n",
      "[500]\ttraining's multi_logloss: 0.117201\tvalid_1's multi_logloss: 0.275582\n",
      "[600]\ttraining's multi_logloss: 0.0948654\tvalid_1's multi_logloss: 0.273565\n",
      "Early stopping, best iteration is:\n",
      "[578]\ttraining's multi_logloss: 0.0990779\tvalid_1's multi_logloss: 0.273456\n",
      "cur time = 2018/09/21 20:16:47\n",
      "FOLD:  3\n",
      "2776 11111\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[100]\ttraining's multi_logloss: 0.710814\tvalid_1's multi_logloss: 0.757495\n",
      "[200]\ttraining's multi_logloss: 0.356598\tvalid_1's multi_logloss: 0.432203\n",
      "[300]\ttraining's multi_logloss: 0.219223\tvalid_1's multi_logloss: 0.319802\n",
      "[400]\ttraining's multi_logloss: 0.154809\tvalid_1's multi_logloss: 0.280013\n",
      "[500]\ttraining's multi_logloss: 0.118818\tvalid_1's multi_logloss: 0.2661\n",
      "[600]\ttraining's multi_logloss: 0.0962369\tvalid_1's multi_logloss: 0.262496\n",
      "[700]\ttraining's multi_logloss: 0.0801419\tvalid_1's multi_logloss: 0.262299\n",
      "Early stopping, best iteration is:\n",
      "[660]\ttraining's multi_logloss: 0.0860689\tvalid_1's multi_logloss: 0.261957\n",
      "cur time = 2018/09/21 20:18:25\n",
      "FOLD:  4\n",
      "2775 11112\n",
      "Training until validation scores don't improve for 100 rounds.\n",
      "[100]\ttraining's multi_logloss: 0.711242\tvalid_1's multi_logloss: 0.757122\n",
      "[200]\ttraining's multi_logloss: 0.357074\tvalid_1's multi_logloss: 0.432454\n",
      "[300]\ttraining's multi_logloss: 0.219319\tvalid_1's multi_logloss: 0.319717\n",
      "[400]\ttraining's multi_logloss: 0.155336\tvalid_1's multi_logloss: 0.27957\n",
      "[500]\ttraining's multi_logloss: 0.12014\tvalid_1's multi_logloss: 0.264735\n",
      "[600]\ttraining's multi_logloss: 0.0976771\tvalid_1's multi_logloss: 0.260499\n",
      "[700]\ttraining's multi_logloss: 0.0816694\tvalid_1's multi_logloss: 0.260613\n",
      "Early stopping, best iteration is:\n",
      "[625]\ttraining's multi_logloss: 0.0932625\tvalid_1's multi_logloss: 0.260267\n",
      "cur time = 2018/09/21 20:20:08\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "meta_test = np.zeros(shape = (len(test),8))\n",
    "\n",
    "for seed in range(1):\n",
    "    print 'Times: ',seed\n",
    "    print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
    "    skf = StratifiedKFold(train_labels, n_folds=5, shuffle=True, random_state=seed)\n",
    "    for i,(tr_ind,te_ind) in enumerate(skf):\n",
    "        print 'FOLD: ',i\n",
    "        print len(te_ind),len(tr_ind)\n",
    "        X_train,X_train_label = train[tr_ind],train_labels[tr_ind]\n",
    "        X_val,X_val_label = train[te_ind],train_labels[te_ind]\n",
    "        dtrain = lgb.Dataset(X_train,X_train_label) \n",
    "        dval   = lgb.Dataset(X_val,X_val_label, reference = dtrain)   \n",
    "        params = {\n",
    "                'task':'train', \n",
    "                'boosting_type':'gbdt',\n",
    "                'num_leaves': 15,\n",
    "                'objective': 'multiclass',\n",
    "                'num_class':8,\n",
    "                'learning_rate': 0.01,\n",
    "                'feature_fraction': 0.85,\n",
    "                'subsample':0.85,\n",
    "                'num_threads': 54,\n",
    "                'metric':'multi_logloss',\n",
    "                'seed':seed\n",
    "            }  \n",
    "        model = lgb.train(params, dtrain, num_boost_round=100000,valid_sets=[dtrain,dval],verbose_eval=100, early_stopping_rounds=100)  \n",
    "        pred_test = model.predict(test)\n",
    "\n",
    "        #meta_train[te_ind] = pred_val\n",
    "        meta_test += pred_test\n",
    "        print('cur time = ' + str(datetime.datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")))\n",
    "\n",
    "meta_test/=5.0\n",
    "res = pd.DataFrame(meta_test,columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])\n",
    "res.index.name='file_id'\n",
    "res.round(7).to_csv('submit.csv', index = True, header=True)\n",
    "        \n",
    "      "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "res.shape\n",
    "res.index = range(1,res.shape[0]+1)\n",
    "res.index.name = 'file_id'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "en =res.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0000000000000004"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "en.sum(axis=1).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "en.to_csv('../fuucccccccck.csv',index=True,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>prob0</th>\n",
       "      <th>prob1</th>\n",
       "      <th>prob2</th>\n",
       "      <th>prob3</th>\n",
       "      <th>prob4</th>\n",
       "      <th>prob5</th>\n",
       "      <th>prob6</th>\n",
       "      <th>prob7</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>file_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.002035</td>\n",
       "      <td>0.002127</td>\n",
       "      <td>0.949751</td>\n",
       "      <td>0.009502</td>\n",
       "      <td>0.001805</td>\n",
       "      <td>0.002404</td>\n",
       "      <td>0.005550</td>\n",
       "      <td>0.026825</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.931129</td>\n",
       "      <td>0.002137</td>\n",
       "      <td>0.003289</td>\n",
       "      <td>0.003913</td>\n",
       "      <td>0.002060</td>\n",
       "      <td>0.009254</td>\n",
       "      <td>0.010101</td>\n",
       "      <td>0.038117</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.996000</td>\n",
       "      <td>0.000453</td>\n",
       "      <td>0.000597</td>\n",
       "      <td>0.000630</td>\n",
       "      <td>0.000429</td>\n",
       "      <td>0.000575</td>\n",
       "      <td>0.000560</td>\n",
       "      <td>0.000755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.013627</td>\n",
       "      <td>0.008015</td>\n",
       "      <td>0.018625</td>\n",
       "      <td>0.098806</td>\n",
       "      <td>0.054051</td>\n",
       "      <td>0.092254</td>\n",
       "      <td>0.180903</td>\n",
       "      <td>0.533720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.993833</td>\n",
       "      <td>0.000578</td>\n",
       "      <td>0.001065</td>\n",
       "      <td>0.000852</td>\n",
       "      <td>0.000608</td>\n",
       "      <td>0.000776</td>\n",
       "      <td>0.000779</td>\n",
       "      <td>0.001510</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            prob0     prob1     prob2     prob3     prob4     prob5     prob6  \\\n",
       "file_id                                                                         \n",
       "1        0.002035  0.002127  0.949751  0.009502  0.001805  0.002404  0.005550   \n",
       "2        0.931129  0.002137  0.003289  0.003913  0.002060  0.009254  0.010101   \n",
       "3        0.996000  0.000453  0.000597  0.000630  0.000429  0.000575  0.000560   \n",
       "4        0.013627  0.008015  0.018625  0.098806  0.054051  0.092254  0.180903   \n",
       "5        0.993833  0.000578  0.001065  0.000852  0.000608  0.000776  0.000779   \n",
       "\n",
       "            prob7  \n",
       "file_id            \n",
       "1        0.026825  \n",
       "2        0.038117  \n",
       "3        0.000755  \n",
       "4        0.533720  \n",
       "5        0.001510  "
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "en.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
